/*
********************************************************************************
Description:		Additional results for Appendix C

Uses:				"${mergedata}\CouldhaveSample.dta"
					"${mergedata}\CouldhaveSample_matchedOnSurvives.dta"

Saves:				"${results}\TableC1.csv"
					"${results}\TableC2.csv"
					"${results}\TableC3.csv"
					"${results}\TableC4.csv"
					"${results}\FigureC1.csv"
					"${results}\FigureC2.csv"
					"${results}\FigureC3.csv"

********************************************************************************
*/



**********
*TABLE C1*
**********
*Balancing tests for the matched sample; output goes to TableC1.csv.
*runBalancingTests is a user-written program defined outside this file.
runBalancingTests cpi ///
	using "$mergedata\CouldhaveSample_matchedOnSurvives.dta", ///
	saving("${results}\TableC1.csv") ///
	trimlevel(1) ///
	ctrlsurvives(0)


**********
*TABLE C2*
**********
*Dynamic effects using different control groups
*Table 2 Col 1 is produced as part of Table C3 below (see results for trim=1)
*Here we produce Table 2 Col 2, using stratified random sample for the control group
*	NOTE(review): "Table 2" in the two lines above presumably means Table C2 - confirm against the paper

preserve
*clear is needed here: without it, use stops with an error whenever the data in
*memory have unsaved changes. The preserve above already holds a copy of those
*data, so nothing is lost by clearing.
use "${mergedata}\CouldhaveSample_matchedOnSurvives.dta", clear

*actuallyaudited = 1 on every row of a (utr_no, audityear) group that contains at
*least one row with yrssince == 0 and status_ind == "S", and 0 otherwise
*	NOTE(review): not referenced again in this file - presumably read inside directctrl2sls; confirm
egen actuallyaudited = max((yrssince == 0) & (status_ind == "S")), by(utr_no audityear)

*close any handle left open by an earlier aborted run before re-opening it
cap file close resultsfh
file open resultsfh using "${results}\TableC2.csv", write text replace
	*no control variables are passed here (ctrlvars() is empty)
	directctrl2sls, outcome(it_cl4_cgt_cpi) ctrlvars() trimlevel(1) fh(resultsfh) header printtoscreen
file close resultsfh
restore



**********
*TABLE C3*
**********
*repeat main spec using 0 trimming, 0.5 trimming, and 2.5 trimming, as well as replicating main spec from paper
*(trim level 1 in the list below is the replication of the main spec)

preserve
use "${mergedata}\CouldhaveSample.dta", clear

*close any handle left open by an earlier aborted run; file open fails if the
*handle name is still in use
cap file close resultsfh
file open resultsfh using "${results}\TableC3.csv", write text replace
local firsttime = 1
foreach trimlevel of numlist 0 0.5 1 2.5 {
	*progress message only; it has no effect on the results file
	*NOTE(review): the message names it_cl4_cgt_cpi_diff4 while the call below passes it_cl4_cgt_cpi - confirm which is intended
	di as text "2SLS: outcome=it_cl4_cgt_cpi_diff4, ctrlvarstxt=survives, trimlevel=`trimlevel'"
	local header ""
	if (`firsttime') local header "header" //means we print the header only once
	directctrl2sls, outcome(it_cl4_cgt_cpi) ctrlvars(survives) trimlevel(`trimlevel') fh(resultsfh) `header' printtoscreen
	local firsttime = 0
	*timestamp so the log shows how long each trim level took
	di c(current_date) ", " c(current_time)
}
file close resultsfh
restore



**********
*TABLE C4*
**********
*Income sources are pooled into two groups (per the original note: above vs below
*median autocorrelation) and the regression is re-run on each group

preserve
use "${mergedata}\CouldhaveSample.dta", clear

*stable group:   empinc_cpi + propinc_cpi + pensinc_cpi
*unstable group: sempinc_cpi + divinc_cpi
gen stable_inc   = empinc_cpi + propinc_cpi + pensinc_cpi
gen unstable_inc = sempinc_cpi + divinc_cpi

*rescale each group by the individual's own value on the row where yrssince20==20
*(the audit year, per the original note), so the series equals 1 on that row
foreach grp of varlist stable_inc unstable_inc {
	tempvar base_row base_all
	*value on the yrssince20==20 row only, missing elsewhere
	gen `base_row' = `grp' if yrssince20==20
	*copy that value to every row of the same utr_no
	gegen `base_all' = max(`base_row'), by(utr_no)
	gen `grp'_norm = `grp' / `base_all'
}

*write one set of results per normalised group, each with its own header row
cap file close resultsfh
file open resultsfh using "${results}\TableC4.csv", write text replace
foreach outc of varlist stable_inc_norm unstable_inc_norm {
	directctrlreg, outcome(`outc') ctrlvars(survives) trimlevel(1) fh(resultsfh) header printtoscreen
}
file close resultsfh
restore



***********
*FIGURE C1*
***********
* compare difference in audit probabilities by re-running main spec, but with 'audited' as the outcome

preserve
use "${mergedata}\CouldhaveSample.dta", clear

*generate outcome variables: ever receive audit
*any_audit is 1 on rows where random_sig is "Y" or "N"; random_audit only where it is "Y"
gen byte any_audit      = inlist(random_sig,"Y","N")
gen byte random_audit   = inlist(random_sig,"Y")
tempvar any_audit_tempyear random_audit_tempyear
*tax_year on flagged rows and 0 elsewhere, so the max by utr_no is the latest
*flagged tax year (0 if the individual has no flagged row)
gen `any_audit_tempyear'= tax_year*any_audit
egen any_audit_year     = max(`any_audit_tempyear'), by(utr_no)
*indicator for the row whose tax_year equals that latest flagged year
gen byte audited_any    = any_audit_year==tax_year
gen `random_audit_tempyear'= tax_year*random_audit
egen random_audit_year     = max(`random_audit_tempyear'), by(utr_no)
gen byte audited_random    = random_audit_year==tax_year

*close any handle left open by an earlier aborted run; file open fails if the
*handle name is still in use
cap file close resultsfh
file open resultsfh using "${results}\FigureC1.csv", write text replace
local firsttime = 1
local trimlevel = 0 //can't trim a binary outcome
*control set, defined once so the progress message and the regression call agree
*(this local was previously undefined, so the message printed an empty value)
local ctrlvarstxt "survives"
foreach auditvar of varlist audited_random audited_any {
	di as text "OLS: outcome=`auditvar', ctrlvarstxt=`ctrlvarstxt', trimlevel=`trimlevel'"
	*print the header only for the first outcome
	local header ""
	if (`firsttime') local header "header"
	directctrlreg, outcome(`auditvar') ctrlvars(`ctrlvarstxt') trimlevel(`trimlevel') fh(resultsfh) `header' printtoscreen
	local firsttime = 0
}
file close resultsfh
restore



***********
*FIGURE C2*
***********
* Dynamic effect on total tax, based on predicted audit timing

preserve
use "${mergedata}\CouldhaveSample.dta", clear

*convert audit open date to taxyear in which audit takes place
*	(convert_date_to_taxyear is defined outside this file; presumably it creates
*	audityear_open from enqry_start_date - confirm)
convert_date_to_taxyear enqry_start_date audityear_open

*the regression is fitted on rows with random_sig == "Y" only; predict has no
*if-condition, so the fitted value is filled in for every row
qui reg audityear_open audityear if random_sig == "Y"
predict audityear_open_pred //here trying to predict audityear of opening directly
replace audityear_open_pred = round(audityear_open_pred)

*rebuild event time around the predicted opening year, keeping the original
*variable under a new name
gen yrssince20_open_pred =  tax_year - audityear_open_pred + 20
ren (yrssince20) (yrssince20_orig)
ren (yrssince20_open_pred) (yrssince20) //directctrl2sls assumes yrssince20 exists and is the variable to use

*close any handle left open by an earlier aborted run; file open fails if the
*handle name is still in use
cap file close resultsfh
file open resultsfh using "${results}\FigureC2.csv", write text replace
directctrl2sls, outcome(it_cl4_cgt_cpi) ctrlvars(survives) trimlevel(1) fh(resultsfh) header printtoscreen
file close resultsfh
restore



***********
*FIGURE C3*
***********
*Dynamics by income source, with event time based on _predicted_ audit timing

preserve
use "${mergedata}\CouldhaveSample.dta", clear

*generate useful regressors
***************************
*for each source, sh_<source> = <source>inc_cpi / totinc_cpi, then centred on its
*sample mean so that sh_<source> ends up holding the demeaned share
foreach src in semp emp prop pens div int {
	tempvar centred
	qui gen double sh_`src' = `src'inc_cpi / totinc_cpi
	qui summarize sh_`src', meanonly
	gen double `centred' = sh_`src' - r(mean)
	replace sh_`src' = `centred'
}

*construct time to audit opening
********************************
*Builds audit_time_to_open: one value per utr_no, derived from the gap in days
*between the filing deadline and enqry_start_date on rows with random_sig=="Y".
*the bit below is a bit complicated as it deals with multiple audits - otherwise risk having time to open/close based on audit that relates to a different tax year
tempvar filing_deadline audit_time_to_open_temp audit_time_to_open_min audit_time_to_open_min2 audit_time_to_open_max audit_time_to_open_max2 audit_time_to_open_final
*filing deadline = 31 January of the year audityear+1, as a Stata daily date
gen `filing_deadline' = date("31/01/"+string(audityear+1),"DMY")
format `filing_deadline' %td
*signed gap in days (enquiry start minus deadline); missing unless random_sig=="Y"
gen `audit_time_to_open_temp' = enqry_start_date - `filing_deadline' if random_sig=="Y"
*smallest positive gap per utr_no, filled only on rows whose own gap is positive
gegen `audit_time_to_open_min' = min(`audit_time_to_open_temp') if `audit_time_to_open_temp'>0, by(utr_no)
*largest negative gap per utr_no (closest to zero), filled only on rows whose own gap is negative
*	NOTE(review): a gap of exactly 0 satisfies neither condition and is left out of both - confirm this is intended
gegen `audit_time_to_open_max' = max(`audit_time_to_open_temp') if `audit_time_to_open_temp'<0, by(utr_no)
*flip the sign so the negative gap becomes a positive number of days
replace `audit_time_to_open_max' = `audit_time_to_open_max' *-1
*copy both per-person values onto every row of the utr_no
gegen `audit_time_to_open_max2' = min(`audit_time_to_open_max'), by(utr_no)
gegen `audit_time_to_open_min2' = min(`audit_time_to_open_min'), by(utr_no)
*row-wise minimum of the two candidates
gegen `audit_time_to_open_final' = rowmin(`audit_time_to_open_min2' `audit_time_to_open_max2')
*NOTE(review): the next line assigns the variable to itself, so it changes no values - the intended right-hand side is unclear; confirm
replace `audit_time_to_open_final' = `audit_time_to_open_final' if `audit_time_to_open_final'!=`audit_time_to_open_min'
*collapse to one value per utr_no
gegen audit_time_to_open = max(`audit_time_to_open_final'), by(utr_no)

*small tail of these are pretty extreme, probably in error, so winsorise
*top-code at the 99.5th percentile, which gquantiles leaves in r(r1); missing values are left untouched
gquantiles audit_time_to_open, _pctile percentiles(99.5)
replace audit_time_to_open = r(r1) if audit_time_to_open > r(r1) & audit_time_to_open <.  

*predict time to audit opening
******************************
*Predicts days-to-opening from income shares and tax year, converts the predicted
*opening date to a tax year, and rebuilds yrssince20 around that predicted year.
*predict based on tax year (as Fig C2) and share of reported income from various sources (since whole issue is that audits might be opened faster for ppl with different sources)
*	note: don't predict it separately by income source else would be being inconsistent within individual - could give different predictions for inc sources for same person, which wouldn't make sense
*each c.x##c.x term enters the share and its square; i.tax_year adds tax-year dummies
reg audit_time_to_open c.sh_semp##c.sh_semp c.sh_emp##c.sh_emp c.sh_prop##c.sh_prop c.sh_pens##c.sh_pens c.sh_div##c.sh_div c.sh_int##c.sh_int i.tax_year
*no if-condition, so the fitted value is filled in for every row where it can be computed
predict days_to_open_predict
*redeclaring filing_deadline gives the macro a fresh temporary name; the variable is then rebuilt with the same formula as above
tempvar filing_deadline pred_open pred_open2 temp 
gen `filing_deadline' = date("31/01/"+string(audityear+1),"DMY")
*predicted opening date = filing deadline + predicted days
gen `pred_open' = `filing_deadline'+days_to_open_predict 
*convert_date_to_taxyear is defined outside this file; presumably it stores the tax year of the first argument in the second - confirm
convert_date_to_taxyear `pred_open' `pred_open2' 
*years between audityear and the predicted opening tax year
gen `temp' = `pred_open2' - audityear
*cap the lag at 6
*	NOTE(review): the upper bound of the range is missing (.), so missing lags may also be recoded to 6 - confirm this is intended
recode `temp' (6/.=6)
*keep the original event-time variable under a new name, then rebuild it
ren yrssince20 yrssince_orig
gen byte yrssince20 = tax_year - (audityear + `temp') + 20 //predicted audit opening
 	

*generate figure results
************************
*close any handle left open by an earlier aborted run before re-opening it
cap file close resultsfh

*one set of results per income source, each written with its own header row
file open resultsfh using "${results}\FigureC3.csv", write text replace
foreach incvar of varlist empinc_cpi sempinc_cpi propinc_cpi pensinc_cpi divinc_cpi {
	directctrlreg, outcome(`incvar') ctrlvars(survives) trimlevel(1) fh(resultsfh) header printtoscreen
}
file close resultsfh

*NOTE: for each income source, need to divide the results by the value at yrssince20==0 to get normalised values for figure
*	NOTE(review): the Table C4 section normalises at yrssince20==20 - confirm whether "yrssince20==0" above should read yrssince20==20

*explicit restore to pair with the preserve at the start of the Figure C3 section,
*as every other section does, rather than relying on the automatic restore that
*happens when the do-file ends
restore


exit
